# Import required packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the book-review dataset from "books.csv" (path is relative to the
# current working directory) into a DataFrame
books = pd.read_csv("books.csv")

# Preview the first few rows to get a feel for the columns and their contents
books.head()

# Print column names, dtypes and non-null counts. In the captured run below
# there are 15719 rows and 9 columns, and only "review/summary" has a missing
# value (15718 non-null).
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15719 entries, 0 to 15718
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               15719 non-null  object 
 1   price               15719 non-null  float64
 2   review/helpfulness  15719 non-null  object 
 3   review/summary      15718 non-null  object 
 4   review/text         15719 non-null  object 
 5   description         15719 non-null  object 
 6   authors             15719 non-null  object 
 7   categories          15719 non-null  object 
 8   popularity          15719 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.1+ MB

# Plot how many rows carry each "popularity" label to check class balance
sns.countplot(data=books, x="popularity")
plt.show()

# Count rows per category, most frequent first. In the captured run below
# there are 313 distinct categories, many of which occur only once.
books["categories"].value_counts()

categories
'Fiction'                      3520
'Religion'                     1053
'Biography & Autobiography'     852
'Juvenile Fiction'              815
'History'                       754
                               ... 
'Sunflowers'                      1
'Self-confidence'                 1
'United States'                   1
'Note-taking'                     1
'Asthma'                          1
Name: count, Length: 313, dtype: int64

# Keep only rows whose category occurs more than 100 times; sparsely
# represented categories are removed to avoid overfitting on them
books = books[books.groupby("categories")["categories"].transform("size") > 100]

# Build indicator columns for the remaining categories (drop_first=True leaves
# out the first category, which then serves as the implicit baseline)
categories = pd.get_dummies(books["categories"], drop_first=True)

# Append the indicator columns and discard the raw column they replace
books = pd.concat([books, categories], axis="columns").drop(columns="categories")

# "review/helpfulness" holds "<helpful votes>/<total votes>" strings such as
# "17/19"; split it once and reuse both halves
helpfulness_votes = books["review/helpfulness"].str.split("/", expand=True)

# Total number of votes on the review (the part after the slash)
books["num_reviews"] = helpfulness_votes[1].astype(int)

# Number of votes that found the review helpful (the part before the slash)
books["num_helpful"] = helpfulness_votes[0].astype(int)

# Share of helpful votes, which normalizes for how many votes a review got.
# Rows with no votes at all ("0/0") divide to NaN, so they are set to 0.
# The fill is applied to the expression and assigned back: calling
# fillna(..., inplace=True) on a selected column is a chained in-place update,
# which is deprecated and leaves the DataFrame unchanged under Copy-on-Write.
books["perc_helpful_reviews"] = (
    books["num_helpful"] / books["num_reviews"]
).fillna(0)

# The raw string column is no longer needed once the numeric ones exist
books.drop(columns=["review/helpfulness"], inplace=True)

# Lowercase the free-text columns
for col in ("review/summary", "review/text", "description"):
    books[col] = books[col].str.lower()

# Words treated as signals of positive sentiment
positive_words = [
    "great", "excellent", "good", "interesting", "enjoy", "helpful", "useful",
    "like", "love", "beautiful", "fantastic", "perfect", "wonderful",
    "impressive", "amazing", "outstanding", "remarkable", "brilliant",
    "exceptional", "positive", "thrilling",
]

# Restrict the vectorizer to the positive words so that only they are counted
vectorizer = CountVectorizer(vocabulary=positive_words)

# Text columns to score
review_text = books["review/text"]
review_summary = books["review/summary"]
description = books["description"]

# Per-document counts of each positive word; missing text is treated as empty
text_transformed = vectorizer.fit_transform(review_text.fillna(""))
summary_transformed = vectorizer.fit_transform(review_summary.fillna(""))
description_transformed = vectorizer.fit_transform(description.fillna(""))

# Total positive-word count per row, flattened to one value per row, stored as
# a simple sentiment measure for each text field
books["positive_words_text"] = np.asarray(text_transformed.sum(axis=1)).ravel()
books["positive_words_summary"] = np.asarray(summary_transformed.sum(axis=1)).ravel()
books["positive_words_description"] = np.asarray(description_transformed.sum(axis=1)).ravel()

# The raw text columns have been replaced by their counts
books = books.drop(columns=["review/text", "review/summary", "description"])

# Features: every remaining column except the identifying text fields and the
# label itself
X = books.drop(columns=["title", "authors", "popularity"]).to_numpy()

# Target: the popularity label, kept as a column vector
y = books["popularity"].to_numpy().reshape(-1, 1)

# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Random forest with balanced class weights
clf = RandomForestClassifier(
    n_estimators=120,
    max_depth=50,
    min_samples_split=5,
    class_weight="balanced",
    random_state=42,
)
# fit() expects a 1-D target, so flatten the column vector
clf.fit(X_train, y_train.ravel())

# Report accuracy on the training data, then on the held-out test data
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9617126389460683
0.7090036014405763

	title	price	review/helpfulness	review/summary	review/text	description	authors	categories	popularity
0	We Band of Angels: The Untold Story of America...	10.88	2/3	A Great Book about women in WWII	I have alway been a fan of fiction books set i...	In the fall of 1941, the Philippines was a gar...	'Elizabeth Norman'	'History'	Unpopular
1	Prayer That Brings Revival: Interceding for Go...	9.35	0/0	Very helpful book for church prayer groups and...	Very helpful book to give you a better prayer ...	In Prayer That Brings Revival, best-selling au...	'Yong-gi Cho'	'Religion'	Unpopular
2	The Mystical Journey from Jesus to Christ	24.95	17/19	Universal Spiritual Awakening Guide With Some ...	The message of this book is to find yourself a...	THE MYSTICAL JOURNEY FROM JESUS TO CHRIST Disc...	'Muata Ashby'	'Body, Mind & Spirit'	Unpopular
3	Death Row	7.99	0/1	Ben Kincaid tries to stop an execution.	The hero of William Bernhardt's Ben Kincaid no...	Upon receiving his execution date, one of the ...	'Lynden Harris'	'Social Science'	Unpopular
4	Sound and Form in Modern Poetry: Second Editio...	32.50	18/20	good introduction to modern prosody	There's a lot in this book which the reader wi...	An updated and expanded version of a classic a...	'Harvey Seymour Gross', 'Robert McDowell'	'Poetry'	Unpopular

Project Description

Let's create a binary classification model to predict whether a book is rated as "Popular" or "Unpopular".